import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
# create the dataframe
df = pd.read_csv('online_news_popularity.csv')
# set max rows to display
pd.set_option('display.max_rows', None)
# set max columns to display
pd.set_option('display.max_columns', None)
# show the first 5 rows
df.head()
| url | timedelta | n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | num_videos | average_token_length | num_keywords | data_channel_is_lifestyle | data_channel_is_entertainment | data_channel_is_bus | data_channel_is_socmed | data_channel_is_tech | data_channel_is_world | kw_min_min | kw_max_min | kw_avg_min | kw_min_max | kw_max_max | kw_avg_max | kw_min_avg | kw_max_avg | kw_avg_avg | self_reference_min_shares | self_reference_max_shares | self_reference_avg_sharess | weekday_is_monday | weekday_is_tuesday | weekday_is_wednesday | weekday_is_thursday | weekday_is_friday | weekday_is_saturday | weekday_is_sunday | is_weekend | LDA_00 | LDA_01 | LDA_02 | LDA_03 | LDA_04 | global_subjectivity | global_sentiment_polarity | global_rate_positive_words | global_rate_negative_words | rate_positive_words | rate_negative_words | avg_positive_polarity | min_positive_polarity | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | shares | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | http://mashable.com/2013/01/07/amazon-instant-... | 731.0 | 12.0 | 219.0 | 0.663594 | 1.0 | 0.815385 | 4.0 | 2.0 | 1.0 | 0.0 | 4.680365 | 5.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 496.0 | 496.0 | 496.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500331 | 0.378279 | 0.040005 | 0.041263 | 0.040123 | 0.521617 | 0.092562 | 0.045662 | 0.013699 | 0.769231 | 0.230769 | 0.378636 | 0.100000 | 0.7 | -0.350000 | -0.600 | -0.200000 | 0.500000 | -0.187500 | 0.000000 | 0.187500 | 593 |
| 1 | http://mashable.com/2013/01/07/ap-samsung-spon... | 731.0 | 9.0 | 255.0 | 0.604743 | 1.0 | 0.791946 | 3.0 | 1.0 | 1.0 | 0.0 | 4.913725 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.799756 | 0.050047 | 0.050096 | 0.050101 | 0.050001 | 0.341246 | 0.148948 | 0.043137 | 0.015686 | 0.733333 | 0.266667 | 0.286915 | 0.033333 | 0.7 | -0.118750 | -0.125 | -0.100000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 711 |
| 2 | http://mashable.com/2013/01/07/apple-40-billio... | 731.0 | 9.0 | 211.0 | 0.575130 | 1.0 | 0.663866 | 3.0 | 1.0 | 1.0 | 0.0 | 4.393365 | 6.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 918.0 | 918.0 | 918.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.217792 | 0.033334 | 0.033351 | 0.033334 | 0.682188 | 0.702222 | 0.323333 | 0.056872 | 0.009479 | 0.857143 | 0.142857 | 0.495833 | 0.100000 | 1.0 | -0.466667 | -0.800 | -0.133333 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1500 |
| 3 | http://mashable.com/2013/01/07/astronaut-notre... | 731.0 | 9.0 | 531.0 | 0.503788 | 1.0 | 0.665635 | 9.0 | 0.0 | 1.0 | 0.0 | 4.404896 | 7.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028573 | 0.419300 | 0.494651 | 0.028905 | 0.028572 | 0.429850 | 0.100705 | 0.041431 | 0.020716 | 0.666667 | 0.333333 | 0.385965 | 0.136364 | 0.8 | -0.369697 | -0.600 | -0.166667 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1200 |
| 4 | http://mashable.com/2013/01/07/att-u-verse-apps/ | 731.0 | 13.0 | 1072.0 | 0.415646 | 1.0 | 0.540890 | 19.0 | 19.0 | 20.0 | 0.0 | 4.682836 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 545.0 | 16000.0 | 3151.157895 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028633 | 0.028794 | 0.028575 | 0.028572 | 0.885427 | 0.513502 | 0.281003 | 0.074627 | 0.012127 | 0.860215 | 0.139785 | 0.411127 | 0.033333 | 1.0 | -0.220192 | -0.500 | -0.050000 | 0.454545 | 0.136364 | 0.045455 | 0.136364 | 505 |
# shape of the dataframe
rows, columns = df.shape
print("Number of rows: ", rows)
print("Number of columns: ", columns)
Number of rows:  39644
Number of columns:  61
# types of the columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 39644 entries, 0 to 39643 Data columns (total 61 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 url 39644 non-null object 1 timedelta 39644 non-null float64 2 n_tokens_title 39644 non-null float64 3 n_tokens_content 39644 non-null float64 4 n_unique_tokens 39644 non-null float64 5 n_non_stop_words 39644 non-null float64 6 n_non_stop_unique_tokens 39644 non-null float64 7 num_hrefs 39644 non-null float64 8 num_self_hrefs 39644 non-null float64 9 num_imgs 39644 non-null float64 10 num_videos 39644 non-null float64 11 average_token_length 39644 non-null float64 12 num_keywords 39644 non-null float64 13 data_channel_is_lifestyle 39644 non-null float64 14 data_channel_is_entertainment 39644 non-null float64 15 data_channel_is_bus 39644 non-null float64 16 data_channel_is_socmed 39644 non-null float64 17 data_channel_is_tech 39644 non-null float64 18 data_channel_is_world 39644 non-null float64 19 kw_min_min 39644 non-null float64 20 kw_max_min 39644 non-null float64 21 kw_avg_min 39644 non-null float64 22 kw_min_max 39644 non-null float64 23 kw_max_max 39644 non-null float64 24 kw_avg_max 39644 non-null float64 25 kw_min_avg 39644 non-null float64 26 kw_max_avg 39644 non-null float64 27 kw_avg_avg 39644 non-null float64 28 self_reference_min_shares 39644 non-null float64 29 self_reference_max_shares 39644 non-null float64 30 self_reference_avg_sharess 39644 non-null float64 31 weekday_is_monday 39644 non-null float64 32 weekday_is_tuesday 39644 non-null float64 33 weekday_is_wednesday 39644 non-null float64 34 weekday_is_thursday 39644 non-null float64 35 weekday_is_friday 39644 non-null float64 36 weekday_is_saturday 39644 non-null float64 37 weekday_is_sunday 39644 non-null float64 38 is_weekend 39644 non-null float64 39 LDA_00 39644 non-null float64 40 LDA_01 39644 non-null float64 41 LDA_02 39644 non-null float64 42 LDA_03 39644 non-null float64 43 LDA_04 39644 non-null float64 44 global_subjectivity 39644 non-null float64 45 global_sentiment_polarity 39644 non-null float64 46 global_rate_positive_words 39644 non-null float64 47 global_rate_negative_words 39644 non-null float64 48 rate_positive_words 39644 non-null float64 49 rate_negative_words 39644 non-null float64 50 avg_positive_polarity 39644 non-null float64 51 min_positive_polarity 39644 non-null float64 52 max_positive_polarity 39644 non-null float64 53 avg_negative_polarity 39644 non-null float64 54 min_negative_polarity 39644 non-null float64 55 max_negative_polarity 39644 non-null float64 56 title_subjectivity 39644 non-null float64 57 title_sentiment_polarity 39644 non-null float64 58 abs_title_subjectivity 39644 non-null float64 59 abs_title_sentiment_polarity 39644 non-null float64 60 shares 39644 non-null int64 dtypes: float64(59), int64(1), object(1) memory usage: 18.5+ MB
# describe the dataframe
df.describe()
| timedelta | n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | num_videos | average_token_length | num_keywords | data_channel_is_lifestyle | data_channel_is_entertainment | data_channel_is_bus | data_channel_is_socmed | data_channel_is_tech | data_channel_is_world | kw_min_min | kw_max_min | kw_avg_min | kw_min_max | kw_max_max | kw_avg_max | kw_min_avg | kw_max_avg | kw_avg_avg | self_reference_min_shares | self_reference_max_shares | self_reference_avg_sharess | weekday_is_monday | weekday_is_tuesday | weekday_is_wednesday | weekday_is_thursday | weekday_is_friday | weekday_is_saturday | weekday_is_sunday | is_weekend | LDA_00 | LDA_01 | LDA_02 | LDA_03 | LDA_04 | global_subjectivity | global_sentiment_polarity | global_rate_positive_words | global_rate_negative_words | rate_positive_words | rate_negative_words | avg_positive_polarity | min_positive_polarity | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | shares | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 | 39644.000000 |
| mean | 354.530471 | 10.398749 | 546.514731 | 0.548216 | 0.996469 | 0.689175 | 10.883690 | 3.293638 | 4.544143 | 1.249874 | 4.548239 | 7.223767 | 0.052946 | 0.178009 | 0.157855 | 0.058597 | 0.185299 | 0.212567 | 26.106801 | 1153.951682 | 312.366967 | 13612.354102 | 752324.066694 | 259281.938083 | 1117.146610 | 5657.211151 | 3135.858639 | 3998.755396 | 10329.212662 | 6401.697580 | 0.168020 | 0.186409 | 0.187544 | 0.183306 | 0.143805 | 0.061876 | 0.069039 | 0.130915 | 0.184599 | 0.141256 | 0.216321 | 0.223770 | 0.234029 | 0.443370 | 0.119309 | 0.039625 | 0.016612 | 0.682150 | 0.287934 | 0.353825 | 0.095446 | 0.756728 | -0.259524 | -0.521944 | -0.107500 | 0.282353 | 0.071425 | 0.341843 | 0.156064 | 3395.380184 |
| std | 214.163767 | 2.114037 | 471.107508 | 3.520708 | 5.231231 | 3.264816 | 11.332017 | 3.855141 | 8.309434 | 4.107855 | 0.844406 | 1.909130 | 0.223929 | 0.382525 | 0.364610 | 0.234871 | 0.388545 | 0.409129 | 69.633215 | 3857.990877 | 620.783887 | 57986.029357 | 214502.129573 | 135102.247285 | 1137.456951 | 6098.871957 | 1318.150397 | 19738.670516 | 41027.576613 | 24211.332231 | 0.373889 | 0.389441 | 0.390353 | 0.386922 | 0.350896 | 0.240933 | 0.253524 | 0.337312 | 0.262975 | 0.219707 | 0.282145 | 0.295191 | 0.289183 | 0.116685 | 0.096931 | 0.017429 | 0.010828 | 0.190206 | 0.156156 | 0.104542 | 0.071315 | 0.247786 | 0.127726 | 0.290290 | 0.095373 | 0.324247 | 0.265450 | 0.188791 | 0.226294 | 11626.950749 |
| min | 8.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.000000 | 0.000000 | -1.000000 | 0.000000 | 0.000000 | 0.000000 | -1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.393750 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.000000 | -1.000000 | -1.000000 | 0.000000 | -1.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 164.000000 | 9.000000 | 246.000000 | 0.470870 | 1.000000 | 0.625739 | 4.000000 | 1.000000 | 1.000000 | 0.000000 | 4.478404 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.000000 | 445.000000 | 141.750000 | 0.000000 | 843300.000000 | 172846.875000 | 0.000000 | 3562.101631 | 2382.448566 | 639.000000 | 1100.000000 | 981.187500 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.025051 | 0.025012 | 0.028571 | 0.028571 | 0.028574 | 0.396167 | 0.057757 | 0.028384 | 0.009615 | 0.600000 | 0.185185 | 0.306244 | 0.050000 | 0.600000 | -0.328383 | -0.700000 | -0.125000 | 0.000000 | 0.000000 | 0.166667 | 0.000000 | 946.000000 |
| 50% | 339.000000 | 10.000000 | 409.000000 | 0.539226 | 1.000000 | 0.690476 | 8.000000 | 3.000000 | 1.000000 | 0.000000 | 4.664082 | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.000000 | 660.000000 | 235.500000 | 1400.000000 | 843300.000000 | 244572.222223 | 1023.635611 | 4355.688836 | 2870.074878 | 1200.000000 | 2800.000000 | 2200.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.033387 | 0.033345 | 0.040004 | 0.040001 | 0.040727 | 0.453457 | 0.119117 | 0.039023 | 0.015337 | 0.710526 | 0.280000 | 0.358755 | 0.100000 | 0.800000 | -0.253333 | -0.500000 | -0.100000 | 0.150000 | 0.000000 | 0.500000 | 0.000000 | 1400.000000 |
| 75% | 542.000000 | 12.000000 | 716.000000 | 0.608696 | 1.000000 | 0.754630 | 14.000000 | 4.000000 | 4.000000 | 1.000000 | 4.854839 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 1000.000000 | 357.000000 | 7900.000000 | 843300.000000 | 330980.000000 | 2056.781032 | 6019.953968 | 3600.229564 | 2600.000000 | 8000.000000 | 5200.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.240958 | 0.150831 | 0.334218 | 0.375763 | 0.399986 | 0.508333 | 0.177832 | 0.050279 | 0.021739 | 0.800000 | 0.384615 | 0.411428 | 0.100000 | 1.000000 | -0.186905 | -0.300000 | -0.050000 | 0.500000 | 0.150000 | 0.500000 | 0.250000 | 2800.000000 |
| max | 731.000000 | 23.000000 | 8474.000000 | 701.000000 | 1042.000000 | 650.000000 | 304.000000 | 116.000000 | 128.000000 | 91.000000 | 8.041534 | 10.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 377.000000 | 298400.000000 | 42827.857143 | 843300.000000 | 843300.000000 | 843300.000000 | 3613.039819 | 298400.000000 | 43567.659946 | 843300.000000 | 843300.000000 | 843300.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.926994 | 0.925947 | 0.919999 | 0.926534 | 0.927191 | 1.000000 | 0.727841 | 0.155488 | 0.184932 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.500000 | 1.000000 | 843300.000000 |
# check for missing values
print(df.isnull().sum())
url 0 timedelta 0 n_tokens_title 0 n_tokens_content 0 n_unique_tokens 0 n_non_stop_words 0 n_non_stop_unique_tokens 0 num_hrefs 0 num_self_hrefs 0 num_imgs 0 num_videos 0 average_token_length 0 num_keywords 0 data_channel_is_lifestyle 0 data_channel_is_entertainment 0 data_channel_is_bus 0 data_channel_is_socmed 0 data_channel_is_tech 0 data_channel_is_world 0 kw_min_min 0 kw_max_min 0 kw_avg_min 0 kw_min_max 0 kw_max_max 0 kw_avg_max 0 kw_min_avg 0 kw_max_avg 0 kw_avg_avg 0 self_reference_min_shares 0 self_reference_max_shares 0 self_reference_avg_sharess 0 weekday_is_monday 0 weekday_is_tuesday 0 weekday_is_wednesday 0 weekday_is_thursday 0 weekday_is_friday 0 weekday_is_saturday 0 weekday_is_sunday 0 is_weekend 0 LDA_00 0 LDA_01 0 LDA_02 0 LDA_03 0 LDA_04 0 global_subjectivity 0 global_sentiment_polarity 0 global_rate_positive_words 0 global_rate_negative_words 0 rate_positive_words 0 rate_negative_words 0 avg_positive_polarity 0 min_positive_polarity 0 max_positive_polarity 0 avg_negative_polarity 0 min_negative_polarity 0 max_negative_polarity 0 title_subjectivity 0 title_sentiment_polarity 0 abs_title_subjectivity 0 abs_title_sentiment_polarity 0 shares 0 dtype: int64
Here, we can see that we don't have any missing values in our dataset. So, let's continue with the data cleaning.
# set 10 rows to display
pd.set_option('display.max_rows', 10)
Let's see if some data doesn't make sense at all. Are the articles with 0 words or no title really articles?
# show the rows with ' n_tokens_title' == 0 or ' n_tokens_content' == 0
df.loc[(df[' n_tokens_title'] == 0) | (df[' n_tokens_content'] == 0)]
| url | timedelta | n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | num_videos | average_token_length | num_keywords | data_channel_is_lifestyle | data_channel_is_entertainment | data_channel_is_bus | data_channel_is_socmed | data_channel_is_tech | data_channel_is_world | kw_min_min | kw_max_min | kw_avg_min | kw_min_max | kw_max_max | kw_avg_max | kw_min_avg | kw_max_avg | kw_avg_avg | self_reference_min_shares | self_reference_max_shares | self_reference_avg_sharess | weekday_is_monday | weekday_is_tuesday | weekday_is_wednesday | weekday_is_thursday | weekday_is_friday | weekday_is_saturday | weekday_is_sunday | is_weekend | LDA_00 | LDA_01 | LDA_02 | LDA_03 | LDA_04 | global_subjectivity | global_sentiment_polarity | global_rate_positive_words | global_rate_negative_words | rate_positive_words | rate_negative_words | avg_positive_polarity | min_positive_polarity | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | shares | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 893 | http://mashable.com/2013/01/23/actual-facebook... | 715.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 217.0 | 1200.0 | 542.500000 | 0.0 | 51900.0 | 16928.571429 | 0.000000 | 3647.272727 | 1923.430603 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.191383 | 0.028703 | 0.179282 | 0.572047 | 0.028586 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.550000 | -0.500000 | 0.050000 | 0.500000 | 2500 |
| 917 | http://mashable.com/2013/01/23/fitness-gadget-... | 715.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 217.0 | 2000.0 | 582.285714 | 0.0 | 51900.0 | 16080.000000 | 0.000000 | 2939.239130 | 1596.149445 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.020362 | 0.133522 | 0.020019 | 0.020554 | 0.805543 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1700 |
| 1062 | http://mashable.com/2013/01/25/data-vs-nature-... | 713.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 217.0 | 2500.0 | 669.000000 | 0.0 | 51900.0 | 17255.555556 | 0.000000 | 4380.000000 | 2328.023284 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.532843 | 0.022387 | 0.260606 | 0.023187 | 0.160977 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 4500 |
| 1121 | http://mashable.com/2013/01/26/infographics-ma... | 712.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 217.0 | 1700.0 | 611.000000 | 0.0 | 51900.0 | 13628.571429 | 0.000000 | 4408.000000 | 2332.097082 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.885229 | 0.028718 | 0.028587 | 0.028866 | 0.028599 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.066667 | 0.033333 | 0.433333 | 0.033333 | 6800 |
| 1312 | http://mashable.com/2013/01/29/social-tv-chart... | 709.0 | 14.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 217.0 | 1700.0 | 517.800000 | 440.0 | 51900.0 | 17628.000000 | 440.000000 | 4423.333333 | 2305.412167 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.020977 | 0.020356 | 0.020075 | 0.918569 | 0.020023 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.283333 | 0.266667 | 0.216667 | 0.266667 | 1000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39598 | http://mashable.com/2014/12/26/holiday-decorat... | 9.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 1000.0 | 199.200000 | 0.0 | 843300.0 | 457883.333333 | 0.000000 | 5829.174629 | 4067.441144 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.033336 | 0.033444 | 0.033335 | 0.866552 | 0.033334 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.333333 | 0.250000 | 0.166667 | 0.250000 | 6100 |
| 39601 | http://mashable.com/2014/12/26/minority-author... | 9.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 426.0 | 126.227143 | 211600.0 | 843300.0 | 586742.857143 | 3385.393320 | 7519.376771 | 6133.006554 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028572 | 0.028582 | 0.028571 | 0.885704 | 0.028571 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.625000 | 0.400000 | 0.125000 | 0.400000 | 2300 |
| 39613 | http://mashable.com/2014/12/26/the-interview-b... | 9.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 14.0 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -1.0 | 397.0 | 57.900000 | 2300.0 | 843300.0 | 258400.000000 | 1079.714286 | 4032.469314 | 2575.552255 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.121078 | 0.120580 | 0.718204 | 0.020110 | 0.020028 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1600 |
| 39615 | http://mashable.com/2014/12/26/toothpaste-fluo... | 9.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21.0 | 1.0 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -1.0 | 1100.0 | 281.300000 | 17100.0 | 843300.0 | 194840.000000 | 2141.095781 | 5471.574662 | 3605.376162 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.020322 | 0.020021 | 0.499228 | 0.440050 | 0.020378 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.454545 | 0.136364 | 0.045455 | 0.136364 | 5700 |
| 39616 | http://mashable.com/2014/12/26/top-photography... | 9.0 | 15.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 1100.0 | 261.666667 | 9100.0 | 843300.0 | 433366.666667 | 2005.754386 | 5829.174629 | 3835.453639 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.033414 | 0.033347 | 0.033459 | 0.866389 | 0.033390 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.454545 | 0.136364 | 0.045455 | 0.136364 | 2100 |
1181 rows × 61 columns
# mask: rows where ' n_tokens_title' or ' n_tokens_content' is 0 AND the other 3 columns are 0 (link/image/video)
mask = ((df[' n_tokens_title'] == 0) | (df[' n_tokens_content'] == 0)) & ((df[' num_hrefs'] == 0) & (df[' num_imgs'] == 0) & (df[' num_videos'] == 0))
# use the mask to drop the rows
df = df[~mask]
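As a quick sanity check (not in the original notebook), we can confirm how many rows the filter removed:
# shape after filtering; 101 of the 39,644 articles are removed, leaving 39,543 rows (consistent with the row counts shown later)
print(df.shape)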
What we have done so far here: we removed the articles that have no content and also contain no links, images, or videos, since they are not really articles.
Next, we drop the first two columns, url and timedelta. The first is the link to the article, and the second is the time between the article's publication and the acquisition of the dataset. Neither is useful for our analysis.
# drop the first two columns
df.drop(columns=['url', ' timedelta'], inplace=True) # beware of the space in some column names
In our dataset, some column names have a space at the beginning. We remove these spaces.
# rename the columns: strip leading/trailing whitespace from every column name
df.rename(columns=str.strip, inplace=True)
# check the columns
df.columns
Index(['n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
'n_non_stop_words', 'n_non_stop_unique_tokens', 'num_hrefs',
'num_self_hrefs', 'num_imgs', 'num_videos', 'average_token_length',
'num_keywords', 'data_channel_is_lifestyle',
'data_channel_is_entertainment', 'data_channel_is_bus',
'data_channel_is_socmed', 'data_channel_is_tech',
'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
'global_sentiment_polarity', 'global_rate_positive_words',
'global_rate_negative_words', 'rate_positive_words',
'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
'max_positive_polarity', 'avg_negative_polarity',
'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity',
'title_sentiment_polarity', 'abs_title_subjectivity',
'abs_title_sentiment_polarity', 'shares'],
dtype='object')
is_saturday, is_sunday and is_weekend
# show correlation matrix between days
sns.heatmap(df[['weekday_is_saturday', 'weekday_is_sunday', 'is_weekend']].corr(), annot=True, cmap='coolwarm')
plt.xticks(rotation=45)
plt.show()
Here, as expected, the correlation between weekday_is_saturday / weekday_is_sunday and is_weekend is high, because the weekend is by definition either Saturday or Sunday. The is_weekend column is therefore redundant, so we drop it.
# drop the column 'is_weekend'
df.drop(columns=['is_weekend'], inplace=True)
What we have done so far: we dropped the redundant is_weekend column.
What we are going to do: rebuild a single categorical column, Day, from the one-hot encoded weekday_is_* columns, so that we can plot the number of articles and the shares per day.
# new column 'Day' using one-hot encoded columns 'weekday_is_ ...'
def to_day(row):
if row['weekday_is_monday'] == 1:
return 'Monday'
elif row['weekday_is_tuesday'] == 1:
return 'Tuesday'
elif row['weekday_is_wednesday'] == 1:
return 'Wednesday'
elif row['weekday_is_thursday'] == 1:
return 'Thursday'
elif row['weekday_is_friday'] == 1:
return 'Friday'
elif row['weekday_is_saturday'] == 1:
return 'Saturday'
elif row['weekday_is_sunday'] == 1:
return 'Sunday'
else:
return 'No day'
df['Day'] = df.apply(to_day, axis=1)
df.head()
| n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | num_videos | average_token_length | num_keywords | data_channel_is_lifestyle | data_channel_is_entertainment | data_channel_is_bus | data_channel_is_socmed | data_channel_is_tech | data_channel_is_world | kw_min_min | kw_max_min | kw_avg_min | kw_min_max | kw_max_max | kw_avg_max | kw_min_avg | kw_max_avg | kw_avg_avg | self_reference_min_shares | self_reference_max_shares | self_reference_avg_sharess | weekday_is_monday | weekday_is_tuesday | weekday_is_wednesday | weekday_is_thursday | weekday_is_friday | weekday_is_saturday | weekday_is_sunday | LDA_00 | LDA_01 | LDA_02 | LDA_03 | LDA_04 | global_subjectivity | global_sentiment_polarity | global_rate_positive_words | global_rate_negative_words | rate_positive_words | rate_negative_words | avg_positive_polarity | min_positive_polarity | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | shares | Day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12.0 | 219.0 | 0.663594 | 1.0 | 0.815385 | 4.0 | 2.0 | 1.0 | 0.0 | 4.680365 | 5.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 496.0 | 496.0 | 496.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500331 | 0.378279 | 0.040005 | 0.041263 | 0.040123 | 0.521617 | 0.092562 | 0.045662 | 0.013699 | 0.769231 | 0.230769 | 0.378636 | 0.100000 | 0.7 | -0.350000 | -0.600 | -0.200000 | 0.500000 | -0.187500 | 0.000000 | 0.187500 | 593 | Monday |
| 1 | 9.0 | 255.0 | 0.604743 | 1.0 | 0.791946 | 3.0 | 1.0 | 1.0 | 0.0 | 4.913725 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.799756 | 0.050047 | 0.050096 | 0.050101 | 0.050001 | 0.341246 | 0.148948 | 0.043137 | 0.015686 | 0.733333 | 0.266667 | 0.286915 | 0.033333 | 0.7 | -0.118750 | -0.125 | -0.100000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 711 | Monday |
| 2 | 9.0 | 211.0 | 0.575130 | 1.0 | 0.663866 | 3.0 | 1.0 | 1.0 | 0.0 | 4.393365 | 6.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 918.0 | 918.0 | 918.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.217792 | 0.033334 | 0.033351 | 0.033334 | 0.682188 | 0.702222 | 0.323333 | 0.056872 | 0.009479 | 0.857143 | 0.142857 | 0.495833 | 0.100000 | 1.0 | -0.466667 | -0.800 | -0.133333 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1500 | Monday |
| 3 | 9.0 | 531.0 | 0.503788 | 1.0 | 0.665635 | 9.0 | 0.0 | 1.0 | 0.0 | 4.404896 | 7.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028573 | 0.419300 | 0.494651 | 0.028905 | 0.028572 | 0.429850 | 0.100705 | 0.041431 | 0.020716 | 0.666667 | 0.333333 | 0.385965 | 0.136364 | 0.8 | -0.369697 | -0.600 | -0.166667 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1200 | Monday |
| 4 | 13.0 | 1072.0 | 0.415646 | 1.0 | 0.540890 | 19.0 | 19.0 | 20.0 | 0.0 | 4.682836 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 545.0 | 16000.0 | 3151.157895 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028633 | 0.028794 | 0.028575 | 0.028572 | 0.885427 | 0.513502 | 0.281003 | 0.074627 | 0.012127 | 0.860215 | 0.139785 | 0.411127 | 0.033333 | 1.0 | -0.220192 | -0.500 | -0.050000 | 0.454545 | 0.136364 | 0.045455 | 0.136364 | 505 | Monday |
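As an aside, the same Day column could also be built without apply, using idxmax over the one-hot weekday columns (a sketch assuming at most one weekday flag is set per row):
weekday_cols = ['weekday_is_monday', 'weekday_is_tuesday', 'weekday_is_wednesday',
                'weekday_is_thursday', 'weekday_is_friday', 'weekday_is_saturday',
                'weekday_is_sunday']
# idxmax picks the column holding the 1 for each row; strip the prefix and capitalize
day_alt = df[weekday_cols].idxmax(axis=1).str.replace('weekday_is_', '').str.capitalize()
# rows where no weekday flag is set would otherwise default to the first column
day_alt[df[weekday_cols].sum(axis=1) == 0] = 'No day'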
# the number of articles per day
sns.countplot(x='Day', data=df)
plt.title('Number of articles per day')
plt.show()
# now the shares per day
sns.barplot(x='Day', y='shares', data=df)
plt.title('Shares per day')
plt.show()
# drop column newly created 'Day'
df.drop(columns=['Day'], inplace=True)
# new column 'Subject' using one-hot encoded columns 'data_channel_is_ ...'
def to_subject(row):
if row['data_channel_is_lifestyle'] == 1:
return 'Lifestyle'
elif row['data_channel_is_entertainment'] == 1:
return 'Entertainment'
elif row['data_channel_is_bus'] == 1:
return 'Business'
elif row['data_channel_is_socmed'] == 1:
return 'Social Media'
elif row['data_channel_is_tech'] == 1:
return 'Tech'
elif row['data_channel_is_world'] == 1:
return 'World'
else:
return 'No subject'
df['Subject'] = df.apply(to_subject, axis=1)
df.head()
| n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | num_videos | average_token_length | num_keywords | data_channel_is_lifestyle | data_channel_is_entertainment | data_channel_is_bus | data_channel_is_socmed | data_channel_is_tech | data_channel_is_world | kw_min_min | kw_max_min | kw_avg_min | kw_min_max | kw_max_max | kw_avg_max | kw_min_avg | kw_max_avg | kw_avg_avg | self_reference_min_shares | self_reference_max_shares | self_reference_avg_sharess | weekday_is_monday | weekday_is_tuesday | weekday_is_wednesday | weekday_is_thursday | weekday_is_friday | weekday_is_saturday | weekday_is_sunday | LDA_00 | LDA_01 | LDA_02 | LDA_03 | LDA_04 | global_subjectivity | global_sentiment_polarity | global_rate_positive_words | global_rate_negative_words | rate_positive_words | rate_negative_words | avg_positive_polarity | min_positive_polarity | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | shares | Subject | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12.0 | 219.0 | 0.663594 | 1.0 | 0.815385 | 4.0 | 2.0 | 1.0 | 0.0 | 4.680365 | 5.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 496.0 | 496.0 | 496.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500331 | 0.378279 | 0.040005 | 0.041263 | 0.040123 | 0.521617 | 0.092562 | 0.045662 | 0.013699 | 0.769231 | 0.230769 | 0.378636 | 0.100000 | 0.7 | -0.350000 | -0.600 | -0.200000 | 0.500000 | -0.187500 | 0.000000 | 0.187500 | 593 | Entertainment |
| 1 | 9.0 | 255.0 | 0.604743 | 1.0 | 0.791946 | 3.0 | 1.0 | 1.0 | 0.0 | 4.913725 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.799756 | 0.050047 | 0.050096 | 0.050101 | 0.050001 | 0.341246 | 0.148948 | 0.043137 | 0.015686 | 0.733333 | 0.266667 | 0.286915 | 0.033333 | 0.7 | -0.118750 | -0.125 | -0.100000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 711 | Business |
| 2 | 9.0 | 211.0 | 0.575130 | 1.0 | 0.663866 | 3.0 | 1.0 | 1.0 | 0.0 | 4.393365 | 6.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 918.0 | 918.0 | 918.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.217792 | 0.033334 | 0.033351 | 0.033334 | 0.682188 | 0.702222 | 0.323333 | 0.056872 | 0.009479 | 0.857143 | 0.142857 | 0.495833 | 0.100000 | 1.0 | -0.466667 | -0.800 | -0.133333 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1500 | Business |
| 3 | 9.0 | 531.0 | 0.503788 | 1.0 | 0.665635 | 9.0 | 0.0 | 1.0 | 0.0 | 4.404896 | 7.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028573 | 0.419300 | 0.494651 | 0.028905 | 0.028572 | 0.429850 | 0.100705 | 0.041431 | 0.020716 | 0.666667 | 0.333333 | 0.385965 | 0.136364 | 0.8 | -0.369697 | -0.600 | -0.166667 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 1200 | Entertainment |
| 4 | 13.0 | 1072.0 | 0.415646 | 1.0 | 0.540890 | 19.0 | 19.0 | 20.0 | 0.0 | 4.682836 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 545.0 | 16000.0 | 3151.157895 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028633 | 0.028794 | 0.028575 | 0.028572 | 0.885427 | 0.513502 | 0.281003 | 0.074627 | 0.012127 | 0.860215 | 0.139785 | 0.411127 | 0.033333 | 1.0 | -0.220192 | -0.500 | -0.050000 | 0.454545 | 0.136364 | 0.045455 | 0.136364 | 505 | Tech |
# the number of articles per subject
sns.countplot(x='Subject', data=df)
plt.title('Number of articles per subject')
plt.show()
# now the shares per subject
sns.barplot(x='Subject', y='shares', data=df)
plt.title('Shares per subject')
plt.show()
fig = px.scatter(df, x="n_tokens_content", y="shares", title="Link between n_tokens_content (number of words) and shares", width=2000, height=700)
fig.show()
Here, we see that the number of words has an influence on the number of shares, and not in the way we expected: an article with many words is not necessarily shared more than an article with few words. The articles with the most shares have between 0 and 500 words.
Important to note:
# subjectivity
fig = px.scatter(df, x="global_subjectivity", y="shares", title="Link between global_subjectivity and shares", width=2000, height=700)
fig.show()
Here, concerning the subjectivity of the articles, the distribution appears to be roughly normal.
# polarity
fig = px.scatter(df, x="global_sentiment_polarity", y="shares", title="Link between global_sentiment_polarity and shares", width=2000, height=700)
fig.show()
The same goes for the polarity of the articles. The distribution is more centered around 0 (since polarity can be negative), though not exactly: it leans slightly towards the positive side.
df_wo_nosubj = df[df['Subject'] != 'No subject'] # takes out the 'No subject' value, as we can't interpret them
sns.barplot(x='Subject', y='abs_title_subjectivity', data=df_wo_nosubj)
plt.title('Absolute title subjectivity per subject')
plt.show()
This plot shows that World, Lifestyle and Social Media have the highest absolute title subjectivity. However, the disparity between subjects is very small.
sns.barplot(x='Subject', y='title_subjectivity', data=df_wo_nosubj)
plt.title('Felt title subjectivity per subject')
plt.show()
When looking at the 'felt' title subjectivity, the disparities are much larger. The Entertainment section of the website has a much higher title subjectivity than the other ones. In second place is Lifestyle, which is also noticeably higher. All the others are close to each other.
An important thing to note is that the two lowest are the World and Business subjects, which include elements of politics. Given how subjective that matter usually is, it is quite surprising to see such low title subjectivity scores. This may mean that the Mashable articles on World and Business are written quite objectively.
sns.barplot(x='Subject', y='global_subjectivity', data=df_wo_nosubj)
plt.title('Content subjectivity per subject')
plt.show()
When looking solely at the subjectivity of the content (as opposed to the titles above), World remains by far the most objective. Business is still the second least subjective.
And Lifestyle, which had the second most subjective titles, has the most subjective content.
sns.barplot(x='Subject', y='global_sentiment_polarity', data=df_wo_nosubj)
plt.title('Content polarity per subject')
plt.show()
# drop column newly created 'Subject'
df.drop(columns=['Subject'], inplace=True)
avg_kw_plt = df[['kw_min_avg','kw_max_avg','kw_avg_avg']]
plt.figure(figsize=(50,40))
plt.plot(avg_kw_plt)
plt.title('Average keyword (min, max, avg) popularity over time', fontsize=35)
plt.tick_params(labelsize=20)
plt.show()
Link between the subjectivity of the title and the number of shares.
sns.heatmap(df[['abs_title_subjectivity','title_subjectivity', 'shares']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between title subjectivity and shares')
plt.xticks(rotation=45)
plt.show()
This heatmap lets us check two things: how the title subjectivity measures relate to the number of shares, and how the two subjectivity measures relate to each other.
Link between the title length and the number of shares.
plt.figure(figsize=(20,10))
sns.countplot(x='n_tokens_title', data=df)
plt.title('Number of words in title')
plt.show()
Link between the number of videos/images and number of shares.
sns.heatmap(df[['shares','num_imgs', 'num_videos']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between number of images/videos and shares')
plt.xticks(rotation=45)
plt.show()
Link between positivity/negativity and shares.
sns.heatmap(df[['shares','global_rate_positive_words', 'global_rate_negative_words' ]].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation between positive/negative words and shares')
plt.xticks(rotation=45)
plt.show()
The correlation between the rate of negative words and shares appears to be roughly ten times larger than the one for positive words, but both correlations are still very low, so the positivity or negativity of an article says little about how much it will be shared.
from sklearn.model_selection import train_test_split
# import the scaler
from sklearn.preprocessing import StandardScaler
# import the grid search
from sklearn.model_selection import GridSearchCV
# import the models
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# import the metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
Here, we define some variables and functions that we will use later and that will prove really useful.
First, we define two new variables:
X: the features;
Y: the target.
# split the dataframe into two variables
X = df.drop(columns=['shares']) # features
Y = df['shares'] # target
# transform the task into a binary task using a decision threshold of 1400 shares (roughly the median, so the two classes are approximately balanced)
Y = Y.apply(lambda x: 1 if x >= 1400 else 0)
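As a quick check (not in the original notebook), since 1,400 is roughly the median number of shares, the two classes should be close to balanced:
# proportion of articles in each class after binarization
print(Y.value_counts(normalize=True))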
X
| n_tokens_title | n_tokens_content | n_unique_tokens | n_non_stop_words | n_non_stop_unique_tokens | num_hrefs | num_self_hrefs | num_imgs | num_videos | average_token_length | num_keywords | data_channel_is_lifestyle | data_channel_is_entertainment | data_channel_is_bus | data_channel_is_socmed | data_channel_is_tech | data_channel_is_world | kw_min_min | kw_max_min | kw_avg_min | kw_min_max | kw_max_max | kw_avg_max | kw_min_avg | kw_max_avg | kw_avg_avg | self_reference_min_shares | self_reference_max_shares | self_reference_avg_sharess | weekday_is_monday | weekday_is_tuesday | weekday_is_wednesday | weekday_is_thursday | weekday_is_friday | weekday_is_saturday | weekday_is_sunday | LDA_00 | LDA_01 | LDA_02 | LDA_03 | LDA_04 | global_subjectivity | global_sentiment_polarity | global_rate_positive_words | global_rate_negative_words | rate_positive_words | rate_negative_words | avg_positive_polarity | min_positive_polarity | max_positive_polarity | avg_negative_polarity | min_negative_polarity | max_negative_polarity | title_subjectivity | title_sentiment_polarity | abs_title_subjectivity | abs_title_sentiment_polarity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12.0 | 219.0 | 0.663594 | 1.0 | 0.815385 | 4.0 | 2.0 | 1.0 | 0.0 | 4.680365 | 5.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 496.0 | 496.0 | 496.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.500331 | 0.378279 | 0.040005 | 0.041263 | 0.040123 | 0.521617 | 0.092562 | 0.045662 | 0.013699 | 0.769231 | 0.230769 | 0.378636 | 0.100000 | 0.70 | -0.350000 | -0.600 | -0.200000 | 0.500000 | -0.187500 | 0.000000 | 0.187500 |
| 1 | 9.0 | 255.0 | 0.604743 | 1.0 | 0.791946 | 3.0 | 1.0 | 1.0 | 0.0 | 4.913725 | 4.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.799756 | 0.050047 | 0.050096 | 0.050101 | 0.050001 | 0.341246 | 0.148948 | 0.043137 | 0.015686 | 0.733333 | 0.266667 | 0.286915 | 0.033333 | 0.70 | -0.118750 | -0.125 | -0.100000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 |
| 2 | 9.0 | 211.0 | 0.575130 | 1.0 | 0.663866 | 3.0 | 1.0 | 1.0 | 0.0 | 4.393365 | 6.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 918.0 | 918.0 | 918.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.217792 | 0.033334 | 0.033351 | 0.033334 | 0.682188 | 0.702222 | 0.323333 | 0.056872 | 0.009479 | 0.857143 | 0.142857 | 0.495833 | 0.100000 | 1.00 | -0.466667 | -0.800 | -0.133333 | 0.000000 | 0.000000 | 0.500000 | 0.000000 |
| 3 | 9.0 | 531.0 | 0.503788 | 1.0 | 0.665635 | 9.0 | 0.0 | 1.0 | 0.0 | 4.404896 | 7.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028573 | 0.419300 | 0.494651 | 0.028905 | 0.028572 | 0.429850 | 0.100705 | 0.041431 | 0.020716 | 0.666667 | 0.333333 | 0.385965 | 0.136364 | 0.80 | -0.369697 | -0.600 | -0.166667 | 0.000000 | 0.000000 | 0.500000 | 0.000000 |
| 4 | 13.0 | 1072.0 | 0.415646 | 1.0 | 0.540890 | 19.0 | 19.0 | 20.0 | 0.0 | 4.682836 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 545.0 | 16000.0 | 3151.157895 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.028633 | 0.028794 | 0.028575 | 0.028572 | 0.885427 | 0.513502 | 0.281003 | 0.074627 | 0.012127 | 0.860215 | 0.139785 | 0.411127 | 0.033333 | 1.00 | -0.220192 | -0.500 | -0.050000 | 0.454545 | 0.136364 | 0.045455 | 0.136364 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39639 | 11.0 | 346.0 | 0.529052 | 1.0 | 0.684783 | 9.0 | 7.0 | 1.0 | 1.0 | 4.523121 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.0 | 671.0 | 173.125 | 26900.0 | 843300.0 | 374962.500000 | 2514.742857 | 4004.342857 | 3031.115764 | 11400.0 | 48000.0 | 37033.333333 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.025038 | 0.025001 | 0.151701 | 0.025000 | 0.773260 | 0.482679 | 0.141964 | 0.037572 | 0.014451 | 0.722222 | 0.277778 | 0.333791 | 0.100000 | 0.75 | -0.260000 | -0.500 | -0.125000 | 0.100000 | 0.000000 | 0.400000 | 0.000000 |
| 39640 | 12.0 | 328.0 | 0.696296 | 1.0 | 0.885057 | 9.0 | 7.0 | 3.0 | 48.0 | 4.405488 | 7.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | -1.0 | 616.0 | 184.000 | 6500.0 | 843300.0 | 192985.714286 | 1664.267857 | 5470.168651 | 3411.660830 | 2100.0 | 2100.0 | 2100.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.029349 | 0.028575 | 0.231866 | 0.681635 | 0.028575 | 0.564374 | 0.194249 | 0.039634 | 0.009146 | 0.812500 | 0.187500 | 0.374825 | 0.136364 | 0.70 | -0.211111 | -0.400 | -0.100000 | 0.300000 | 1.000000 | 0.200000 | 1.000000 |
| 39641 | 10.0 | 442.0 | 0.516355 | 1.0 | 0.644128 | 24.0 | 1.0 | 12.0 | 1.0 | 5.076923 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 691.0 | 168.250 | 6200.0 | 843300.0 | 295850.000000 | 1753.882353 | 6880.687034 | 4206.439195 | 1400.0 | 1400.0 | 1400.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.159004 | 0.025025 | 0.025207 | 0.643794 | 0.146970 | 0.510296 | 0.024609 | 0.033937 | 0.024887 | 0.576923 | 0.423077 | 0.307273 | 0.136364 | 0.50 | -0.356439 | -0.800 | -0.166667 | 0.454545 | 0.136364 | 0.045455 | 0.136364 |
| 39642 | 6.0 | 682.0 | 0.539493 | 1.0 | 0.692661 | 10.0 | 1.0 | 1.0 | 0.0 | 4.975073 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -1.0 | 0.0 | -1.000 | 0.0 | 843300.0 | 254600.000000 | 0.000000 | 3384.316871 | 1777.895883 | 452.0 | 452.0 | 452.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.040004 | 0.040003 | 0.839987 | 0.040002 | 0.040004 | 0.358578 | -0.008066 | 0.020528 | 0.023460 | 0.466667 | 0.533333 | 0.236851 | 0.062500 | 0.50 | -0.205246 | -0.500 | -0.012500 | 0.000000 | 0.000000 | 0.500000 | 0.000000 |
| 39643 | 10.0 | 157.0 | 0.701987 | 1.0 | 0.846154 | 1.0 | 1.0 | 0.0 | 2.0 | 4.471338 | 4.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 | 97.0 | 23.500 | 205600.0 | 843300.0 | 366200.000000 | 3035.080555 | 3613.512953 | 3296.909481 | 2100.0 | 2100.0 | 2100.000000 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.050001 | 0.799339 | 0.050000 | 0.050659 | 0.050001 | 0.517893 | 0.104892 | 0.063694 | 0.012739 | 0.833333 | 0.166667 | 0.247338 | 0.100000 | 0.50 | -0.200000 | -0.200 | -0.200000 | 0.333333 | 0.250000 | 0.166667 | 0.250000 |
39543 rows × 57 columns
Y
0 0
1 0
2 1
3 0
4 0
..
39639 1
39640 1
39641 1
39642 0
39643 0
Name: shares, Length: 39543, dtype: int64
Now we split the data into training and testing sets. We use 80% of the data for training and 20% for testing.
# split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) # use 20% of the data for testing
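Since the target is binary, a variant worth noting (not what is used for the results below) is to stratify the split so that both sets keep exactly the same class balance:
# alternative split keeping the class proportions identical in the train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)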
Here, we are going to plot a correlation matrix to see whether some features are strongly correlated with each other. If so, we will drop one of the correlated features.
Reminder: columns with a correlation coefficient near 0 are not correlated, and columns with an absolute correlation coefficient near 1 are highly correlated.
We set the correlation coefficient threshold at 0.85: if the correlation coefficient between two features is greater than 0.85, we drop one of them.
# plot the correlation matrix
plt.figure(figsize=(27,27))
sns.heatmap(X_train.corr(), annot=True, cmap='coolwarm')
plt.show()
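To complement the visual inspection, a small sketch like this one (not part of the original notebook) lists the feature pairs whose absolute correlation exceeds the 0.85 threshold:
# keep only the upper triangle of the absolute correlation matrix to avoid duplicate pairs
corr = X_train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
# stack() drops the NaNs, leaving one entry per feature pair
high_corr = upper.stack()
print(high_corr[high_corr > 0.85].sort_values(ascending=False))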
The features that we will drop are:
n_non_stop_words;
n_non_stop_unique_tokens;
kw_avg_min;
self_reference_min_shares.
# columns to drop
col_drop = ['n_non_stop_words',
'n_non_stop_unique_tokens',
'kw_avg_min',
'self_reference_min_shares']
# drop the columns
X_train = X_train.drop(columns=col_drop)
X_test = X_test.drop(columns=col_drop)
Let's see the distribution of each feature.
# plot the distribution of each feature
fig, ax = plt.subplots(figsize=(25, 30))
fig.suptitle('Features distribution')
for i, col in enumerate(X_train.columns):
plt.subplot(9, 6, i+1)
sns.histplot(X_train[col])  # histogram only (sns.distplot is deprecated in recent seaborn versions)
plt.show()
Now we define a variable col_encoded that contains the names of the columns that are one-hot encoded. We will not use these columns for scaling. In fact, binary columns should generally not be standardized, as doing so can alter their original binary interpretation (0 or 1) and introduce a level of continuity that was not present initially.
# columns encoded
col_encoded = ['data_channel_is_lifestyle',
'data_channel_is_entertainment',
'data_channel_is_bus',
'data_channel_is_socmed',
'data_channel_is_tech',
'data_channel_is_world',
'weekday_is_monday',
'weekday_is_tuesday',
'weekday_is_wednesday',
'weekday_is_thursday',
'weekday_is_friday',
'weekday_is_saturday',
'weekday_is_sunday']
# columns to scale
col_scaled = X_train.drop(columns=col_encoded).columns
# scale the numerical columns (not the ones encoded)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=col_encoded))
X_test_scaled = scaler.transform(X_test.drop(columns=col_encoded))
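Note that X_train_scaled, as built above, contains only the scaled numerical columns. If we wanted to keep the binary indicator columns alongside the scaled ones, one option (a sketch, not what is used for the results below) is scikit-learn's ColumnTransformer:
from sklearn.compose import ColumnTransformer

# scale the numerical columns and pass the one-hot encoded columns through unchanged
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), list(col_scaled))],
    remainder='passthrough'  # the binary columns in col_encoded are kept as-is
)
X_train_full = preprocessor.fit_transform(X_train)  # X_train_full / X_test_full are illustrative names
X_test_full = preprocessor.transform(X_test)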
We create two functions, one to evaluate a regression model, and one to evaluate a classification model. Then, we create a function to find the best parameters using GridSearchCV on any model.
# function to evaluate a regression model
def evaluate_regression_model(model):
# fit the model
model.fit(X_train_scaled, Y_train)
# predict the target
Y_pred = model.predict(X_test_scaled)
# convert Y_pred to binary
Y_pred = np.where(Y_pred >= 0.5, 1, 0)
# calculate the metrics
accuracy = accuracy_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
return Y_pred, accuracy, mse, r2
# function to print the regression results
def print_regression_results(accuracy, mse, r2):
print('Accuracy: ', accuracy)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
# function to evaluate a classification model
def evaluate_classification_model(model):
# fit the model
model.fit(X_train_scaled, Y_train)
# predict the target
Y_pred = model.predict(X_test_scaled)
# calculate the metrics
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
confusion = confusion_matrix(Y_test, Y_pred)
return Y_pred, accuracy, precision, confusion
# function to print the classification results
def print_classification_results(accuracy, precision, confusion):
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Confusion matrix: \n', confusion)
# function to find the best parameters for the model
def find_best_params(model, params):
# create the grid search
grid_search = GridSearchCV(model, params, cv=2, scoring='accuracy') # with 2-fold cross-validation
# fit the grid search
grid_search.fit(X_train_scaled, Y_train)
return grid_search.best_params_
Here, we create a dictionary to store the accuracy results of each model. We will use it later to compare the models.
# create the dictionary
accuracies = {}
Now, let's do some supervised machine learning. We will use the following algorithms: LinearRegression, Lasso, KNeighborsClassifier, SVC, DecisionTreeClassifier and RandomForestClassifier.
What's the difference between regression and classification algorithms? Regression algorithms predict a continuous value, while classification algorithms predict a discrete value. In our case, Y has already been converted into a binary target (shares above or below 1400), so the two regression models effectively fit that binary target: we threshold their continuous predictions at 0.5 to obtain a class and compute an accuracy. The remaining models are genuine classifiers.
# evaluate the model
Y_pred, accuracy, mse, r2 = evaluate_regression_model(LinearRegression())
# print the results
print_regression_results(accuracy, mse, r2)
# add the accuracy to the dictionary
accuracies['LinearRegression'] = accuracy
Accuracy:  0.6258692628650904
Mean Squared Error: 0.3741307371349096
R-squared: -0.5020323849169899
# evaluate the model
Y_pred, accuracy, mse, r2 = evaluate_regression_model(Lasso())
# print the results
print_regression_results(accuracy, mse, r2)
# add the accuracy to the dictionary
accuracies['Lasso'] = accuracy
Accuracy:  0.5302819572638766
Mean Squared Error: 0.4697180427361234
R-squared: -0.8857892226990933
# evaluate the model
Y_pred, accuracy, precision, confusion = evaluate_classification_model(KNeighborsClassifier())
# print the results
print_classification_results(accuracy, precision, confusion)
# add the accuracy to the dictionary
accuracies['KNeighborsClassifier'] = accuracy
Accuracy:  0.5875584776836515
Precision:  0.6125060357315306
Confusion matrix: 
 [[2110 1605]
 [1657 2537]]
# evaluate the model
Y_pred, accuracy, precision, confusion = evaluate_classification_model(SVC())
# print the results
print_classification_results(accuracy, precision, confusion)
# add the accuracy to the dictionary
accuracies['SVC'] = accuracy
Accuracy:  0.639903906941459
Precision:  0.6449181739879414
Confusion matrix: 
 [[2066 1649]
 [1199 2995]]
Let's do a grid search on the SVC model to find the best parameters. It is the best model for now.
# search the best parameters for the model
params = {'C': [1, 10, 100],
'gamma': [0.1, 0.01, 0.001]}
best_params = find_best_params(SVC(), params)
# print the best parameters
print(best_params)
{'C': 10, 'gamma': 0.01}
# evaluate the model
Y_pred, accuracy, precision, confusion = evaluate_classification_model(SVC(C=best_params['C'], gamma=best_params['gamma']))
# print the results
print_classification_results(accuracy, precision, confusion)
# add the accuracy to the dictionary
accuracies['SVC_gridsearch'] = accuracy
Accuracy:  0.6392717157668479
Precision:  0.6479152878888154
Confusion matrix: 
 [[2119 1596]
 [1257 2937]]
# evaluate the model
Y_pred, accuracy, precision, confusion = evaluate_classification_model(DecisionTreeClassifier())
# print the results
print_classification_results(accuracy, precision, confusion)
# add the accuracy to the dictionary
accuracies['DecisionTreeClassifier'] = accuracy
Accuracy:  0.5737767100771273
Precision:  0.5976738666033705
Confusion matrix: 
 [[2020 1695]
 [1676 2518]]
# evaluate the model
Y_pred, accuracy, precision, confusion = evaluate_classification_model(RandomForestClassifier())
# print the results
print_classification_results(accuracy, precision, confusion)
# add the accuracy to the dictionary
accuracies['RandomForestClassifier'] = accuracy
Accuracy:  0.6371222657731698
Precision:  0.6433521004763967
Confusion matrix: 
 [[2068 1647]
 [1223 2971]]
Let's do a grid search on the RandomForestClassifier model to find the best parameters. It is the new best model for now.
# search the best parameters for the model
params = {'n_estimators': [50, 100, 200],
'max_depth': [10, 20, 30]}
best_params = find_best_params(RandomForestClassifier(), params)
# print the best parameters
print(best_params)
{'max_depth': 30, 'n_estimators': 200}
# evaluate the model
Y_pred, accuracy, precision, confusion = evaluate_classification_model(RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth']))
# print the results
print_classification_results(accuracy, precision, confusion)
# add the accuracy to the dictionary
accuracies['RandomForestClassifier_gridsearch'] = accuracy
Accuracy:  0.6473637628018712
Precision:  0.6501389185723445
Confusion matrix: 
 [[2078 1637]
 [1152 3042]]
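The accuracies dictionary now holds one entry per model, so we can compare them at a glance (a small sketch, not in the original notebook):
# bar chart of the accuracy obtained by each model
plt.figure(figsize=(12, 5))
plt.bar(list(accuracies.keys()), list(accuracies.values()))
plt.xticks(rotation=45, ha='right')
plt.ylabel('Accuracy')
plt.title('Accuracy per model')
plt.show()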
First, let's take a look at our data: it has already been cleaned above.
Next, we import everything we need, which comes from three libraries: Keras (TensorFlow), scikit-learn and scikeras.
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
from keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scikeras.wrappers import KerasClassifier
Now we split the features X and target Y into training and test sets again with train_test_split, then apply a StandardScaler to standardize the input features.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Now we are going to use randomized-search cross-validation to tune the hyperparameters and determine which values give us the best model. We choose random search over grid search because of the number of values we want to test: the grid below contains 3 dropout rates, 2 layer configurations, 3 batch sizes, and 3 epoch counts, i.e. 3 x 2 x 3 x 3 = 54 combinations, each evaluated on 3 folds (162 fits), whereas 10 random iterations only require 30 fits.
First, we define a function that builds the model used by the random search and give it default parameter values. Second, we define the function that performs the random search with the help of the first one: it wraps the model, defines the dictionary of parameter values to test, runs the randomized-search cross-validation with 10 iterations and 3 folds on the training data, prints the mean score of every tested combination, and returns the best parameter combination. We also provide an equivalent grid-search function that works the same way, but, as explained above, we will not use it because it would be far more expensive and time-consuming.
def create_model(lr_rate=0.001, dropout_rate=0.5, units=[128, 64, 32]):
    model = Sequential()
    # input layer matching the number of scaled features
    model.add(Input(shape=(X_train_scaled.shape[1],)))
    # one Dense + Dropout block per entry in `units`
    for unit in units:
        model.add(Dense(unit, activation='relu'))
        model.add(Dropout(dropout_rate))
    # single sigmoid output for the binary popularity target
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=lr_rate)
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy']
    )
    return model
def rand_search_model():
    # wrap the Keras model so it can be used with sklearn's search utilities
    model = KerasClassifier(build_fn=create_model, units=[128, 64, 32], dropout_rate=0.5, lr_rate=0.001, epochs=20, batch_size=32, verbose=0)
    # parameter values to sample from
    param_dist = {
        'dropout_rate': [0.3, 0.5, 0.7],
        'units': [[64, 32], [128, 64, 32]],
        'batch_size': [16, 32, 64],
        'epochs': [10, 20, 30]
    }
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        verbose=2,
        scoring='accuracy'
    )
    search_result = random_search.fit(X_train_scaled, Y_train)
    # report the best combination and the mean CV score of every tested combination
    print("Best: %f using %r" % (search_result.best_score_, search_result.best_params_))
    means = search_result.cv_results_['mean_test_score']
    params = search_result.cv_results_['params']
    for mean, param in zip(means, params):
        print("%f with: %r" % (mean, param))
    return search_result.best_params_
def grid_search_model():
    model = KerasClassifier(build_fn=create_model, units=[128, 64, 32], dropout_rate=0.5, lr_rate=0.001, epochs=20, batch_size=32, verbose=0)
    # exhaustive grid over the same parameter values (kept for reference, not used here)
    param_grid = {
        'dropout_rate': [0.3, 0.5, 0.7],
        'units': [[64, 32], [128, 64, 32]],
        'batch_size': [16, 32, 64],
        'epochs': [10, 20, 30]
    }
    grid_search = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=3,
        verbose=2,
        scoring='accuracy'
    )
    search_result = grid_search.fit(X_train_scaled, Y_train)
    print("Best: %f using %r" % (search_result.best_score_, search_result.best_params_))
    means = search_result.cv_results_['mean_test_score']
    params = search_result.cv_results_['params']
    for mean, param in zip(means, params):
        print("%f with: %r" % (mean, param))
    return search_result.best_params_
best_params = rand_search_model()
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END batch_size=16, dropout_rate=0.5, epochs=30, units=[128, 64, 32]; total time= 32.0s
[CV] END batch_size=16, dropout_rate=0.5, epochs=30, units=[128, 64, 32]; total time= 31.3s
[CV] END batch_size=16, dropout_rate=0.5, epochs=30, units=[128, 64, 32]; total time= 33.1s
[CV] END batch_size=32, dropout_rate=0.7, epochs=10, units=[64, 32]; total time= 5.6s
[CV] END batch_size=32, dropout_rate=0.7, epochs=10, units=[64, 32]; total time= 5.3s
[CV] END batch_size=32, dropout_rate=0.7, epochs=10, units=[64, 32]; total time= 5.4s
[CV] END batch_size=64, dropout_rate=0.3, epochs=10, units=[64, 32]; total time= 3.2s
[CV] END batch_size=64, dropout_rate=0.3, epochs=10, units=[64, 32]; total time= 3.2s
[CV] END batch_size=64, dropout_rate=0.3, epochs=10, units=[64, 32]; total time= 3.2s
[CV] END batch_size=64, dropout_rate=0.5, epochs=20, units=[64, 32]; total time= 5.7s
[CV] END batch_size=64, dropout_rate=0.5, epochs=20, units=[64, 32]; total time= 5.9s
[CV] END batch_size=64, dropout_rate=0.5, epochs=20, units=[64, 32]; total time= 5.7s
[CV] END batch_size=16, dropout_rate=0.5, epochs=30, units=[64, 32]; total time= 27.3s
[CV] END batch_size=16, dropout_rate=0.5, epochs=30, units=[64, 32]; total time= 27.5s
[CV] END batch_size=16, dropout_rate=0.5, epochs=30, units=[64, 32]; total time= 27.6s
[CV] END batch_size=16, dropout_rate=0.3, epochs=20, units=[128, 64, 32]; total time= 21.7s
[CV] END batch_size=16, dropout_rate=0.3, epochs=20, units=[128, 64, 32]; total time= 21.5s
[CV] END batch_size=16, dropout_rate=0.3, epochs=20, units=[128, 64, 32]; total time= 21.7s
[CV] END batch_size=16, dropout_rate=0.3, epochs=10, units=[64, 32]; total time= 9.8s
[CV] END batch_size=16, dropout_rate=0.3, epochs=10, units=[64, 32]; total time= 9.7s
[CV] END batch_size=16, dropout_rate=0.3, epochs=10, units=[64, 32]; total time= 9.6s
[CV] END batch_size=32, dropout_rate=0.7, epochs=10, units=[128, 64, 32]; total time= 6.6s
[CV] END batch_size=32, dropout_rate=0.7, epochs=10, units=[128, 64, 32]; total time= 6.5s
[CV] END batch_size=32, dropout_rate=0.7, epochs=10, units=[128, 64, 32]; total time= 6.6s
[CV] END batch_size=64, dropout_rate=0.3, epochs=30, units=[64, 32]; total time= 8.2s
[CV] END batch_size=64, dropout_rate=0.3, epochs=30, units=[64, 32]; total time= 8.4s
[CV] END batch_size=64, dropout_rate=0.3, epochs=30, units=[64, 32]; total time= 8.4s
[CV] END batch_size=16, dropout_rate=0.5, epochs=20, units=[64, 32]; total time= 18.4s
[CV] END batch_size=16, dropout_rate=0.5, epochs=20, units=[64, 32]; total time= 18.4s
[CV] END batch_size=16, dropout_rate=0.5, epochs=20, units=[64, 32]; total time= 18.3s
Best: 0.662547 using {'units': [64, 32], 'epochs': 10, 'dropout_rate': 0.3, 'batch_size': 16}
0.659891 with: {'units': [128, 64, 32], 'epochs': 30, 'dropout_rate': 0.5, 'batch_size': 16}
0.654517 with: {'units': [64, 32], 'epochs': 10, 'dropout_rate': 0.7, 'batch_size': 32}
0.662230 with: {'units': [64, 32], 'epochs': 10, 'dropout_rate': 0.3, 'batch_size': 64}
0.659069 with: {'units': [64, 32], 'epochs': 20, 'dropout_rate': 0.5, 'batch_size': 64}
0.659196 with: {'units': [64, 32], 'epochs': 30, 'dropout_rate': 0.5, 'batch_size': 16}
0.660081 with: {'units': [128, 64, 32], 'epochs': 20, 'dropout_rate': 0.3, 'batch_size': 16}
0.662547 with: {'units': [64, 32], 'epochs': 10, 'dropout_rate': 0.3, 'batch_size': 16}
0.654106 with: {'units': [128, 64, 32], 'epochs': 10, 'dropout_rate': 0.7, 'batch_size': 32}
0.661725 with: {'units': [64, 32], 'epochs': 30, 'dropout_rate': 0.3, 'batch_size': 64}
0.659417 with: {'units': [64, 32], 'epochs': 20, 'dropout_rate': 0.5, 'batch_size': 16}
Once we've obtained a good set of tuned parameters from the random search, we pass them to our binary_class_model function, which builds the final model with those parameters, fits it to the training data, and then evaluates it on the test data. The random search consistently returns parameter sets whose mean cross-validation accuracy is around 66%. Since this is an average, the accuracy of the final model typically fluctuates between 65.5% and 66.5%, which is already a fairly good result.
def binary_class_model(param_dict, X_tr, Y_tr, X_te, Y_te, method='random'):
    # unpack the best parameters; the two searches return their keys in a different order
    params = list(param_dict.values())
    if method == 'random':
        # RandomizedSearchCV returns: units, epochs, dropout_rate, batch_size
        units = params[0]
        epochs = params[1]
        dropout_rate = params[2]
        batch_size = params[3]
    else:
        # the grid search returns: batch_size, dropout_rate, epochs, units
        batch_size = params[0]
        dropout_rate = params[1]
        epochs = params[2]
        units = params[3]
    # rebuild the same architecture as create_model with the tuned parameters
    model = Sequential()
    model.add(Input(shape=(X_train_scaled.shape[1],)))
    for unit in units:
        model.add(Dense(unit, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=0.001)
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy']
    )
    # train on the training set and evaluate on the held-out test set
    model.fit(X_tr, Y_tr, epochs=epochs, batch_size=batch_size, validation_data=(X_te, Y_te))
    loss, accuracy = model.evaluate(X_te, Y_te)
    print(f'Model loss: {loss} and accuracy: {accuracy}')
    # add the accuracy to the dictionary
    accuracies['NeuralNetwork'] = accuracy
    return model
binary_model = binary_class_model(best_params, X_train_scaled, Y_train, X_test_scaled, Y_test)
Epoch 1/10
1978/1978 [==============================] - 2s 913us/step - loss: 0.6599 - accuracy: 0.6192 - val_loss: 0.6278 - val_accuracy: 0.6484
Epoch 2/10
1978/1978 [==============================] - 2s 869us/step - loss: 0.6317 - accuracy: 0.6507 - val_loss: 0.6224 - val_accuracy: 0.6533
Epoch 3/10
1978/1978 [==============================] - 2s 872us/step - loss: 0.6240 - accuracy: 0.6549 - val_loss: 0.6214 - val_accuracy: 0.6490
Epoch 4/10
1978/1978 [==============================] - 2s 883us/step - loss: 0.6205 - accuracy: 0.6585 - val_loss: 0.6208 - val_accuracy: 0.6510
Epoch 5/10
1978/1978 [==============================] - 2s 868us/step - loss: 0.6172 - accuracy: 0.6604 - val_loss: 0.6176 - val_accuracy: 0.6587
Epoch 6/10
1978/1978 [==============================] - 2s 871us/step - loss: 0.6126 - accuracy: 0.6645 - val_loss: 0.6163 - val_accuracy: 0.6561
Epoch 7/10
1978/1978 [==============================] - 2s 872us/step - loss: 0.6128 - accuracy: 0.6637 - val_loss: 0.6170 - val_accuracy: 0.6572
Epoch 8/10
1978/1978 [==============================] - 2s 862us/step - loss: 0.6107 - accuracy: 0.6655 - val_loss: 0.6160 - val_accuracy: 0.6615
Epoch 9/10
1978/1978 [==============================] - 2s 863us/step - loss: 0.6093 - accuracy: 0.6647 - val_loss: 0.6162 - val_accuracy: 0.6580
Epoch 10/10
1978/1978 [==============================] - 2s 869us/step - loss: 0.6079 - accuracy: 0.6657 - val_loss: 0.6164 - val_accuracy: 0.6587
248/248 [==============================] - 0s 560us/step - loss: 0.6164 - accuracy: 0.6587
Model loss: 0.6163818836212158 and accuracy: 0.6587432026863098
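To visualize these training curves, one option (a small sketch, not part of the original cells: it assumes binary_class_model is modified to keep the History object returned by model.fit, e.g. history = model.fit(...)) is to plot the per-epoch loss and accuracy with matplotlib:
def plot_history(history):
    # left panel: training vs validation loss, right panel: training vs validation accuracy
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    ax1.plot(history.history['loss'], label='train loss')
    ax1.plot(history.history['val_loss'], label='val loss')
    ax1.set_xlabel('epoch')
    ax1.legend()
    ax2.plot(history.history['accuracy'], label='train accuracy')
    ax2.plot(history.history['val_accuracy'], label='val accuracy')
    ax2.set_xlabel('epoch')
    ax2.legend()
    plt.show()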
Now, for our Flask API, we save the trained model to a file so it can be loaded directly in the API (which saves time).
# save the trained model into a file to use it elsewhere
binary_model.save('binary_model.h5')
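As a rough idea of how this saved file could be used, here is a minimal sketch of a Flask endpoint, not the project's actual API code: the route name, the JSON payload format, and the assumption that the client sends an already-scaled feature vector are illustrative choices.
import numpy as np
from flask import Flask, request, jsonify
from keras.models import load_model

app = Flask(__name__)
# load the model saved above once, at startup
model = load_model('binary_model.h5')

@app.route('/predict', methods=['POST'])
def predict():
    # expects a JSON body like {"features": [...]} holding one already-scaled feature vector
    features = np.array(request.json['features'], dtype=float).reshape(1, -1)
    probability = float(model.predict(features)[0][0])
    # same 0.5 threshold as in the notebook
    return jsonify({'probability': probability, 'popular': int(probability > 0.5)})

if __name__ == '__main__':
    app.run(debug=True)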
Finally, we write a function that makes a prediction on a random sample from the test set. A random index is drawn from the size of the test set, and the sample is reshaped so its dimensions match the model's expected input. Since the prediction is a probability, we compare it to 0.5 to determine the predicted label. We then display the actual label, the predicted label, and the predicted probability.
import random
def show_random_prediction(model, X_te, Y_te):
    # pick a random sample from the test set
    random_index = random.randint(0, len(X_te) - 1)
    random_sample = X_te[random_index]
    actual_label = np.array(Y_te.iloc[random_index])
    # reshape to (1, n_features) so the model accepts a single sample
    random_sample = np.reshape(random_sample, (1, -1))
    # the model outputs a probability; threshold it at 0.5 to get the label
    sigmoid_activation = model.predict(random_sample).flatten()[0]
    prediction = 1 if sigmoid_activation > 0.5 else 0
    print("Actual label:")
    print(actual_label)
    print("\nPredicted label:")
    print(prediction)
    print("\nFinal sigmoid activation:")
    print(sigmoid_activation)
show_random_prediction(binary_model, X_test_scaled, Y_test)
1/1 [==============================] - 0s 35ms/step
Actual label: 0
Predicted label: 1
Final sigmoid activation: 0.5148289
Now that we have all the accuracy results, we can compare the models and determine which one performs best. Let's plot the results.
# plot the accuracies (histogram)
px.bar(x=list(accuracies.keys()), y=list(accuracies.values()), color=list(accuracies.keys()), title='Accuracies of the models', width=1000, height=500)
# plot the accuracies (boxplot)
px.box(y=list(accuracies.values()), title='Accuracies of the models', width=600, height=400)
# plot the accuracies (scatterplot)
px.scatter(x=list(accuracies.keys()), y=list(accuracies.values()), color=list(accuracies.keys()), title='Accuracies of the models', width=1000, height=500, size=list(accuracies.values()))
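Alongside the plots, a quick numeric ranking of the accuracies dictionary makes the comparison explicit (a small convenience snippet, not one of the original cells):
# sort the collected accuracies from best to worst
ranking = pd.Series(accuracies).sort_values(ascending=False)
print(ranking)
print(f"Best model: {ranking.index[0]} with accuracy {ranking.iloc[0]:.4f}")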
Thanks to these experiments, we finally found which model is best for our dataset. It is important to note that these accuracies, and model performance in general, depend heavily on the dataset: the type of data, the number of features, the number of rows, and so on. In another module, named Green AI, we did similar work and found that SVC was the best model.
Here, we can see that the best classical model is the RandomForestClassifier, with an accuracy of about 0.64 in general. SVC also reaches a good accuracy, compared to the other models that stay below 0.6.
As for deep learning, the neural network reaches an accuracy of about 0.66, which is better than the other models. Important note: this is not always the case; sometimes the accuracy is closer to 0.65, but it still remains the best.
We will use what we have done with deep learning for our Flask API.